{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 准备数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "from tensorflow.keras import layers, optimizers, datasets\n",
    "\n",
    "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # or any {'0', '1', '2'}\n",
    "\n",
    "def mnist_dataset():\n",
    "    '''Load MNIST via datasets.mnist.load_data() and rescale the images.\n",
    "\n",
    "    Returns:\n",
    "        ((train_images, train_labels), (test_images, test_labels)), where\n",
    "        both image arrays have been divided by 255.0 and the label arrays\n",
    "        are passed through unchanged.\n",
    "    '''\n",
    "    (train_images, train_labels), (test_images, test_labels) = datasets.mnist.load_data()\n",
    "    # normalize\n",
    "    scaled_train = train_images / 255.0\n",
    "    scaled_test = test_images / 255.0\n",
    "    return (scaled_train, train_labels), (scaled_test, test_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demo: NumPy-based automatic differentiation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "class Matmul:\n",
    "    def __init__(self):\n",
    "        self.mem = {}\n",
    "        \n",
    "    def forward(self, x, W):\n",
    "        h = np.matmul(x, W)\n",
    "        self.mem={'x': x, 'W':W}\n",
    "        return h\n",
    "    \n",
    "    def backward(self, grad_y):\n",
    "        '''\n",
    "        x: shape(N, d)\n",
    "        w: shape(d, d')\n",
    "        grad_y: shape(N, d')\n",
    "        '''\n",
    "        x = self.mem['x']\n",
    "        W = self.mem['W']\n",
    "        \n",
    "        ####################\n",
    "        '''计算矩阵乘法的对应的梯度'''\n",
    "        ####################\n",
    "        return grad_x, grad_W\n",
    "\n",
    "\n",
    "class Relu:\n",
    "    def __init__(self):\n",
    "        self.mem = {}\n",
    "        \n",
    "    def forward(self, x):\n",
    "        self.mem['x']=x\n",
    "        return np.where(x > 0, x, np.zeros_like(x))\n",
    "    \n",
    "    def backward(self, grad_y):\n",
    "        '''\n",
    "        grad_y: same shape as x\n",
    "        '''\n",
    "        ####################\n",
    "        '''计算relu 激活函数对应的梯度'''\n",
    "        ####################\n",
    "        return grad_x\n",
    "    \n",
    "\n",
    "\n",
    "class Softmax:\n",
    "    '''\n",
    "    softmax over last dimention\n",
    "    '''\n",
    "    def __init__(self):\n",
    "        self.epsilon = 1e-12\n",
    "        self.mem = {}\n",
    "        \n",
    "    def forward(self, x):\n",
    "        '''\n",
    "        x: shape(N, c)\n",
    "        '''\n",
    "        x_exp = np.exp(x)\n",
    "        partition = np.sum(x_exp, axis=1, keepdims=True)\n",
    "        out = x_exp/(partition+self.epsilon)\n",
    "        \n",
    "        self.mem['out'] = out\n",
    "        self.mem['x_exp'] = x_exp\n",
    "        return out\n",
    "    \n",
    "    def backward(self, grad_y):\n",
    "        '''\n",
    "        grad_y: same shape as x\n",
    "        '''\n",
    "        s = self.mem['out']\n",
    "        sisj = np.matmul(np.expand_dims(s,axis=2), np.expand_dims(s, axis=1)) # (N, c, c)\n",
    "        g_y_exp = np.expand_dims(grad_y, axis=1)\n",
    "        tmp = np.matmul(g_y_exp, sisj) #(N, 1, c)\n",
    "        tmp = np.squeeze(tmp, axis=1)\n",
    "        tmp = -tmp+grad_y*s \n",
    "        return tmp\n",
    "    \n",
    "class Log:\n",
    "    '''\n",
    "    softmax over last dimention\n",
    "    '''\n",
    "    def __init__(self):\n",
    "        self.epsilon = 1e-12\n",
    "        self.mem = {}\n",
    "        \n",
    "    def forward(self, x):\n",
    "        '''\n",
    "        x: shape(N, c)\n",
    "        '''\n",
    "        out = np.log(x+self.epsilon)\n",
    "        \n",
    "        self.mem['x'] = x\n",
    "        return out\n",
    "    \n",
    "    def backward(self, grad_y):\n",
    "        '''\n",
    "        grad_y: same shape as x\n",
    "        '''\n",
    "        x = self.mem['x']\n",
    "        \n",
    "        return 1./(x+1e-12) * grad_y\n",
    "    \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Gradient check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import tensorflow as tf\n",
    "\n",
    "# x = np.random.normal(size=[5, 6])\n",
    "# W = np.random.normal(size=[6, 4])\n",
    "# aa = Matmul()\n",
    "# out = aa.forward(x, W) # shape(5, 4)\n",
    "# grad = aa.backward(np.ones_like(out))\n",
    "# print (grad)\n",
    "\n",
    "# with tf.GradientTape() as tape:\n",
    "#     x, W = tf.constant(x), tf.constant(W)\n",
    "#     tape.watch(x)\n",
    "#     y = tf.matmul(x, W)\n",
    "#     loss = tf.reduce_sum(y)\n",
    "#     grads = tape.gradient(loss, x)\n",
    "#     print (grads)\n",
    "\n",
    "# import tensorflow as tf\n",
    "\n",
    "# x = np.random.normal(size=[5, 6])\n",
    "# aa = Relu()\n",
    "# out = aa.forward(x) # shape(5, 4)\n",
    "# grad = aa.backward(np.ones_like(out))\n",
    "# print (grad)\n",
    "\n",
    "# with tf.GradientTape() as tape:\n",
    "#     x= tf.constant(x)\n",
    "#     tape.watch(x)\n",
    "#     y = tf.nn.relu(x)\n",
    "#     loss = tf.reduce_sum(y)\n",
    "#     grads = tape.gradient(loss, x)\n",
    "#     print (grads)\n",
    "\n",
    "# import tensorflow as tf\n",
    "# x = np.random.normal(size=[5, 6], scale=5.0, loc=1)\n",
    "# label = np.zeros_like(x)\n",
    "# label[0, 1]=1.\n",
    "# label[1, 0]=1\n",
    "# label[1, 1]=1\n",
    "# label[2, 3]=1\n",
    "# label[3, 5]=1\n",
    "# label[4, 0]=1\n",
    "# print(label)\n",
    "# aa = Softmax()\n",
    "# out = aa.forward(x) # shape(5, 6)\n",
    "# grad = aa.backward(label)\n",
    "# print (grad)\n",
    "\n",
    "# with tf.GradientTape() as tape:\n",
    "#     x= tf.constant(x)\n",
    "#     tape.watch(x)\n",
    "#     y = tf.nn.softmax(x)\n",
    "#     loss = tf.reduce_sum(y*label)\n",
    "#     grads = tape.gradient(loss, x)\n",
    "#     print (grads)\n",
    "\n",
    "# import tensorflow as tf\n",
    "\n",
    "# x = np.random.normal(size=[5, 6])\n",
    "# aa = Log()\n",
    "# out = aa.forward(x) # shape(5, 4)\n",
    "# grad = aa.backward(label)\n",
    "# print (grad)\n",
    "\n",
    "# with tf.GradientTape() as tape:\n",
    "#     x= tf.constant(x)\n",
    "#     tape.watch(x)\n",
    "#     y = tf.math.log(x)\n",
    "#     loss = tf.reduce_sum(y*label)\n",
    "#     grads = tape.gradient(loss, x)\n",
    "#     print (grads)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Final Gradient Check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[  0.          29.88862875   0.           0.           0.\n",
      "    0.        ]\n",
      " [363.02160168   0.           0.           0.           0.\n",
      "    0.        ]\n",
      " [  0.           0.           0.          12.25561872   0.\n",
      "    0.        ]\n",
      " [  0.           0.           0.           0.           0.\n",
      "   53.54971193]\n",
      " [689.96870449   0.           0.           0.           0.\n",
      "    0.        ]]\n",
      "----------------------------------------\n",
      "[[  0.          29.88862875   0.           0.           0.\n",
      "    0.        ]\n",
      " [363.02160181   0.           0.           0.           0.\n",
      "    0.        ]\n",
      " [  0.           0.           0.          12.25561872   0.\n",
      "    0.        ]\n",
      " [  0.           0.           0.           0.           0.\n",
      "   53.54971193]\n",
      " [689.96870497   0.           0.           0.           0.\n",
      "    0.        ]]\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "\n",
    "# Draw the inputs and weights first: the label matrix below is shaped after x,\n",
    "# so x has to exist before np.zeros_like(x) runs on a fresh kernel.\n",
    "x = np.random.normal(size=[5, 6])\n",
    "W1 = np.random.normal(size=[6, 5])\n",
    "W2 = np.random.normal(size=[5, 6])\n",
    "\n",
    "# One-hot targets, one class per row.\n",
    "label = np.zeros_like(x)\n",
    "label[0, 1]=1.\n",
    "label[1, 0]=1\n",
    "label[2, 3]=1\n",
    "label[3, 5]=1\n",
    "label[4, 0]=1\n",
    "\n",
    "mul_h1 = Matmul()\n",
    "mul_h2 = Matmul()\n",
    "relu = Relu()\n",
    "softmax = Softmax()\n",
    "log = Log()\n",
    "\n",
    "# Forward pass through the numpy nodes.\n",
    "h1 = mul_h1.forward(x, W1) # shape(5, 5)\n",
    "h1_relu = relu.forward(h1)\n",
    "h2 = mul_h2.forward(h1_relu, W2) # shape(5, 6)\n",
    "h2_soft = softmax.forward(h2)\n",
    "h2_log = log.forward(h2_soft)\n",
    "\n",
    "\n",
    "# Backward pass in reverse order, seeded with d(sum(label * log_prob))/d(log_prob) = label.\n",
    "h2_log_grad = log.backward(label)\n",
    "h2_soft_grad = softmax.backward(h2_log_grad)\n",
    "h2_grad, W2_grad = mul_h2.backward(h2_soft_grad)\n",
    "h1_relu_grad = relu.backward(h2_grad)\n",
    "h1_grad, W1_grad = mul_h1.backward(h1_relu_grad)\n",
    "\n",
    "print(h2_log_grad)\n",
    "print('--'*20)\n",
    "# print(W2_grad)\n",
    "\n",
    "# Reference: the same graph in TensorFlow. The gradient w.r.t. prob is what\n",
    "# h2_log_grad (printed above) should match.\n",
    "with tf.GradientTape() as tape:\n",
    "    x, W1, W2, label = tf.constant(x), tf.constant(W1), tf.constant(W2), tf.constant(label)\n",
    "    tape.watch(W1)\n",
    "    tape.watch(W2)\n",
    "    h1 = tf.matmul(x, W1)\n",
    "    h1_relu = tf.nn.relu(h1)\n",
    "    h2 = tf.matmul(h1_relu, W2)\n",
    "    prob = tf.nn.softmax(h2)\n",
    "    log_prob = tf.math.log(prob)\n",
    "    loss = tf.reduce_sum(label * log_prob)\n",
    "# Query the gradient after recording has finished, so the gradient\n",
    "# computation itself is not recorded on the tape.\n",
    "grads = tape.gradient(loss, [prob])\n",
    "print (grads[0].numpy())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 建立模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "class myModel:\n",
    "    '''Two-layer network wired from the numpy nodes defined in this notebook:\n",
    "    matmul -> relu -> matmul -> softmax -> log.\n",
    "\n",
    "    forward() stores every intermediate activation on the instance and\n",
    "    backward() stores every intermediate gradient, including W1_grad and\n",
    "    W2_grad for the two weight matrices.\n",
    "    '''\n",
    "    def __init__(self):\n",
    "        # Weights; the extra input row of W1 multiplies the constant-one\n",
    "        # column that forward() appends to each sample.\n",
    "        self.W1 = np.random.normal(size=[28*28+1, 100])\n",
    "        self.W2 = np.random.normal(size=[100, 10])\n",
    "\n",
    "        # Computation nodes, one per operation in the graph.\n",
    "        self.mul_h1, self.mul_h2 = Matmul(), Matmul()\n",
    "        self.relu = Relu()\n",
    "        self.softmax = Softmax()\n",
    "        self.log = Log()\n",
    "\n",
    "    def forward(self, x):\n",
    "        '''Run the forward pass; the final log-probabilities land in self.h2_log.\n",
    "\n",
    "        x is reshaped to (-1, 28*28) and a column of ones is appended before\n",
    "        the first matmul. Nothing is returned.\n",
    "        '''\n",
    "        flat = x.reshape(-1, 28*28)\n",
    "        ones_col = np.ones(shape=[flat.shape[0], 1])\n",
    "        augmented = np.concatenate((flat, ones_col), axis=-1)\n",
    "\n",
    "        self.h1 = self.mul_h1.forward(augmented, self.W1) # shape(N, 100)\n",
    "        self.h1_relu = self.relu.forward(self.h1)\n",
    "        self.h2 = self.mul_h2.forward(self.h1_relu, self.W2) # shape(N, 10)\n",
    "        self.h2_soft = self.softmax.forward(self.h2)\n",
    "        self.h2_log = self.log.forward(self.h2_soft)\n",
    "\n",
    "    def backward(self, label):\n",
    "        '''Back-propagate from the log node to both weight matrices.\n",
    "\n",
    "        The chain is seeded with -label, and each node is visited in the\n",
    "        reverse of the forward order. Must be called after forward().\n",
    "        '''\n",
    "        seed = -label\n",
    "        self.h2_log_grad = self.log.backward(seed)\n",
    "        self.h2_soft_grad = self.softmax.backward(self.h2_log_grad)\n",
    "        h2_pair = self.mul_h2.backward(self.h2_soft_grad)\n",
    "        self.h2_grad, self.W2_grad = h2_pair\n",
    "        self.h1_relu_grad = self.relu.backward(self.h2_grad)\n",
    "        h1_pair = self.mul_h1.backward(self.h1_relu_grad)\n",
    "        self.h1_grad, self.W1_grad = h1_pair\n",
    "        \n",
    "model = myModel()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 计算 loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_loss(log_prob, labels):\n",
    "     return np.mean(np.sum(-log_prob*labels, axis=1))\n",
    "    \n",
    "\n",
    "def compute_accuracy(log_prob, labels):\n",
    "    predictions = np.argmax(log_prob, axis=1)\n",
    "    truth = np.argmax(labels, axis=1)\n",
    "    return np.mean(predictions==truth)\n",
    "\n",
    "def train_one_step(model, x, y, lr=1e-5):\n",
    "    '''Run one full-batch gradient-descent step and report the metrics.\n",
    "\n",
    "    model: object exposing forward(x), backward(y), the weights W1 / W2,\n",
    "           their gradients W1_grad / W2_grad, and the log-probabilities h2_log\n",
    "    x:     input batch handed to model.forward\n",
    "    y:     one-hot labels handed to model.backward and to the metrics\n",
    "    lr:    step size applied to both weight matrices (default 1e-5, the\n",
    "           value that was previously hard-coded)\n",
    "\n",
    "    Returns (loss, accuracy). Both are computed from the forward pass made\n",
    "    before the weights were updated.\n",
    "    '''\n",
    "    model.forward(x)\n",
    "    model.backward(y)\n",
    "    # In-place update: the weight arrays keep their identity.\n",
    "    model.W1 -= lr * model.W1_grad\n",
    "    model.W2 -= lr * model.W2_grad\n",
    "    loss = compute_loss(model.h2_log, y)\n",
    "    accuracy = compute_accuracy(model.h2_log, y)\n",
    "    return loss, accuracy\n",
    "\n",
    "def test(model, x, y):\n",
    "    '''Evaluate model on (x, y) with a forward pass only; returns (loss, accuracy).'''\n",
    "    model.forward(x)\n",
    "    log_prob = model.h2_log\n",
    "    return compute_loss(log_prob, y), compute_accuracy(log_prob, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 实际训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 0 : loss 24.72223286454846 ; accuracy 0.07413333333333333\n",
      "epoch 1 : loss 23.924144333646588 ; accuracy 0.1002\n",
      "epoch 2 : loss 23.024883096483002 ; accuracy 0.1252\n",
      "epoch 3 : loss 21.789798024024133 ; accuracy 0.16403333333333334\n",
      "epoch 4 : loss 20.651041530819974 ; accuracy 0.209\n",
      "epoch 5 : loss 19.77011018744175 ; accuracy 0.23966666666666667\n",
      "epoch 6 : loss 18.820253731189734 ; accuracy 0.26993333333333336\n",
      "epoch 7 : loss 17.89411614473916 ; accuracy 0.3019\n",
      "epoch 8 : loss 17.241523788018537 ; accuracy 0.32666666666666666\n",
      "epoch 9 : loss 16.750174736771154 ; accuracy 0.34513333333333335\n",
      "epoch 10 : loss 16.34424433437691 ; accuracy 0.36118333333333336\n",
      "epoch 11 : loss 15.995649918471548 ; accuracy 0.37455\n",
      "epoch 12 : loss 15.686255253464532 ; accuracy 0.38605\n",
      "epoch 13 : loss 15.398560292015457 ; accuracy 0.39716666666666667\n",
      "epoch 14 : loss 15.108667168656357 ; accuracy 0.40665\n",
      "epoch 15 : loss 14.768259637386473 ; accuracy 0.41781666666666667\n",
      "epoch 16 : loss 14.336060964222801 ; accuracy 0.42795\n",
      "epoch 17 : loss 13.800404674290487 ; accuracy 0.4438166666666667\n",
      "epoch 18 : loss 13.238676768175882 ; accuracy 0.45813333333333334\n",
      "epoch 19 : loss 12.275000859496435 ; accuracy 0.49001666666666666\n",
      "epoch 20 : loss 11.789863414935745 ; accuracy 0.5096833333333334\n",
      "epoch 21 : loss 11.099507177500909 ; accuracy 0.5359833333333334\n",
      "epoch 22 : loss 10.947845925466538 ; accuracy 0.5452\n",
      "epoch 23 : loss 10.37985058714904 ; accuracy 0.5686333333333333\n",
      "epoch 24 : loss 10.194234392408456 ; accuracy 0.5764666666666667\n",
      "epoch 25 : loss 9.9158980895203 ; accuracy 0.5883666666666667\n",
      "epoch 26 : loss 9.73326221836977 ; accuracy 0.59515\n",
      "epoch 27 : loss 9.547157941634678 ; accuracy 0.6050833333333333\n",
      "epoch 28 : loss 9.364149820809562 ; accuracy 0.61045\n",
      "epoch 29 : loss 9.241444692225024 ; accuracy 0.6171333333333333\n",
      "epoch 30 : loss 9.033834821879237 ; accuracy 0.6230166666666667\n",
      "epoch 31 : loss 8.909350423734871 ; accuracy 0.62905\n",
      "epoch 32 : loss 8.686975386903871 ; accuracy 0.6366\n",
      "epoch 33 : loss 8.52761694967565 ; accuracy 0.6425166666666666\n",
      "epoch 34 : loss 8.280276344997858 ; accuracy 0.6510666666666667\n",
      "epoch 35 : loss 8.017204559951352 ; accuracy 0.6591833333333333\n",
      "epoch 36 : loss 7.6927551641724925 ; accuracy 0.6705333333333333\n",
      "epoch 37 : loss 7.289147376861199 ; accuracy 0.6818166666666666\n",
      "epoch 38 : loss 6.890080011599251 ; accuracy 0.69715\n",
      "epoch 39 : loss 6.621282824275839 ; accuracy 0.70885\n",
      "epoch 40 : loss 6.432028811711102 ; accuracy 0.71685\n",
      "epoch 41 : loss 6.297568845936132 ; accuracy 0.7227666666666667\n",
      "epoch 42 : loss 6.199755928239367 ; accuracy 0.72755\n",
      "epoch 43 : loss 6.1290342585715685 ; accuracy 0.73055\n",
      "epoch 44 : loss 6.063146731968448 ; accuracy 0.7336166666666667\n",
      "epoch 45 : loss 6.032675366835999 ; accuracy 0.7359\n",
      "epoch 46 : loss 5.960594129095456 ; accuracy 0.73785\n",
      "epoch 47 : loss 5.943588768247914 ; accuracy 0.73995\n",
      "epoch 48 : loss 5.8297749382365 ; accuracy 0.7436666666666667\n",
      "epoch 49 : loss 5.800353627628091 ; accuracy 0.7465833333333334\n",
      "test loss 5.590955869790617 ; accuracy 0.7588\n"
     ]
    }
   ],
   "source": [
    "train_data, test_data = mnist_dataset()\n",
    "\n",
    "# One-hot encode the integer class labels into (num_samples, 10) matrices.\n",
    "num_train = train_data[0].shape[0]\n",
    "num_test = test_data[0].shape[0]\n",
    "train_label = np.zeros(shape=[num_train, 10])\n",
    "train_label[np.arange(num_train), np.array(train_data[1])] = 1.\n",
    "test_label = np.zeros(shape=[num_test, 10])\n",
    "test_label[np.arange(num_test), np.array(test_data[1])] = 1.\n",
    "\n",
    "# Each epoch is a single full-batch step over the whole training set.\n",
    "for epoch in range(50):\n",
    "    loss, accuracy = train_one_step(model, train_data[0], train_label)\n",
    "    print('epoch', epoch, ': loss', loss, '; accuracy', accuracy)\n",
    "\n",
    "# Final evaluation on the held-out test split.\n",
    "loss, accuracy = test(model, test_data[0], test_label)\n",
    "print('test loss', loss, '; accuracy', accuracy)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
